#!/usr/bin/env bash
#
# Launch GRPO training (src/open_r1/grpo.py) through `accelerate launch`,
# using the Accelerate config at recipes/accelerate_configs/zero3.yaml.
#
# Every path below is relative, so run this from the directory that contains
# `recipes/`, `src/` and `data/`.
#
# The arguments are kept in arrays (rather than one backslash-continued
# command) so each group can be commented and a stray space after a `\`
# cannot silently cut the command short.
set -euo pipefail

# Options consumed by `accelerate launch` itself.
launcher_args=(
  --config_file recipes/accelerate_configs/zero3.yaml
  # NOTE(review): presumably one less than the number of GPUs, leaving a
  # device free for vLLM generation (see --use_vllm below) -- confirm.
  --num_processes=3
)

# Options forwarded to the training script.
train_args=(
  # Output / model / dataset locations.
  # NOTE(review): "Qwen25-15B-Instruct" and the "-15B-" in the output
  # directory look like "Qwen2.5-1.5B" with the dots dropped, and the output
  # name mentions a different model family than the one loaded -- confirm
  # that these names are intended.
  --output_dir DeepSeek-R1-Distill-Qwen-15B-GRPO
  --model_name_or_path Qwen25-15B-Instruct
  --dataset_name data/gsm8k/main

  # Prompt and completion length limits.
  --max_prompt_length 512
  --max_completion_length 1024

  # Batch shape and generations per prompt.
  # NOTE(review): num_generations presumably has to divide
  # num_processes * per_device_train_batch_size (3 * 1 here) -- check the
  # trainer's documentation before changing either value.
  --per_device_train_batch_size 1
  --num_generations 3

  # Optimisation schedule and logging.
  --lr_scheduler_type cosine
  --warmup_ratio 0.1
  --logging_strategy steps
  --learning_rate 3.0e-06
  --gradient_accumulation_steps 16
  --logging_steps 10
  --eval_strategy no

  # Numeric precision.
  --bf16

  # vLLM-backed generation.
  --use_vllm
  --vllm_device auto
  --vllm_gpu_memory_utilization 0.7
)

accelerate launch "${launcher_args[@]}" src/open_r1/grpo.py "${train_args[@]}"
